rm(list=ls()) # clear workspace

# set working directory
setwd("C:/Users/cailin.slattery/Dropbox/Incentives/postJM/Replication")

library(MASS)
library(ggplot2)
library(ORDER2PARENT)
library(haven)
library(bbmle)
library(psych)
library(mvtnorm)
library(dplyr)
library(stringr)
library(tidyr)
library(copula)
library(tidyverse)

set.seed(31389) 
theme_set(theme_classic())

#-------------------------------------------------------------------------------#
# Helper functions for MLE

# Creates data matrix for varlist + interaction terms
create_mat <- function(data, vars) {
    i <- 1
    for (v in vars) {
        if (grepl(":", v, fixed = TRUE)) {
            x <- str_split(v, ":", simplify = TRUE)
            if (i == 1) {
                X <- data[, x[1]] * data[, x[2]]
            } else {
                X[, v] <- data[, x[1]] * data[, x[2]]
            }
        } else {
            if (i == 1) {
                X <- data[, v]
            } else {
                X[, v] <- data[, v]
            }
        }
        i <- i + 1
    }
    data.matrix(X)
}

# Creates variable names for random coefficients
get_varnames_rand <- function(rand_vars) {
    sapply(rand_vars, function(x) {paste("sigma", x, sep="_")})
}

# Runs linear model with varlist as input
run_lm <- function(data, vars) {
    vars_full <- c(0, vars)
    f <- as.formula(paste("sub_M", paste(vars_full, collapse = " + "), sep = " ~ "))
    lm(f, data=data)
}


#-------------------------------------------------------------------------------#
# Filter data + create dummies

df <- read_stata("data/analysis_cz.dta")
df <- filter(df, ru_limit == 1) # filter to only include CZs with runner-ups

df$manuf_type_1 <- as.integer(df$manuf_type == 1)
df$manuf_type_3 <- as.integer(df$manuf_type == 3)
df <- filter(df, sub_M < 1000)

# sample one runner-up within subsidy ID
set.seed(1)
df <- df %>%
    group_by(id_win) %>%
    slice_sample(n = 1)

#----------------------------------------------------------------------------#
cur_df <- filter(df, manuf==1)

profit_vars <- c("diff_corp_tax", 
                 "diff_income_tax",
                 "diff_proptax",
                 "diff_wage",
                 "diff_occ_top2_naics3",
                 "diff_occ_top2_naics3:jobs_direct",
                 "diff_pr_college:manuf_type_1",
                 "diff_pr_college:manuf_type_3",
                 "diff_perc_est_n4:manuf_type_1",
                 "diff_corp_tax:invest_B",
                 "diff_e_ind:invest_B",
                 "diff_indus_zoning:invest_B",
                 "diff_auto_roadnetwork:manuf_type_3") 
                 #"diff_airport_any:manuf_type_1")

profit_vars_win <- c("corp_tax_win", 
                     "income_tax_win",
                     "proptax_win",
                     "wage_win",
                     "occ_top2_naics3_win",
                     "occ_top2_naics3_win:jobs_direct",
                     "pr_college_win:manuf_type_1",
                     "pr_college_win:manuf_type_3",
                     "perc_est_n4_win:manuf_type_1",
                     "corp_tax_win:invest_B",
                     "e_ind_win:invest_B",
                     "indus_zoning_win:invest_B",
                     "auto_roadnetwork_win:manuf_type_3") 
                    # "airport_any_win:manuf_type_1")

profit_vars_ru <- c("corp_tax", 
                    "income_tax",
                    "proptax",
                    "wage",
                    "occ_top2_naics3",
                    "occ_top2_naics3:jobs_direct",
                    "pr_college:manuf_type_1",
                    "pr_college:manuf_type_3",
                    "perc_est_n4:manuf_type_1",
                    "corp_tax:invest_B",
                    "e_ind:invest_B",
                    "indus_zoning:invest_B",
                    "auto_roadnetwork:manuf_type_3")
                   # "airport_any:manuf_type_1")

v_vars <- c("jobs_direct",
            "mult_jobs",
            "invest_B",
            "income_tax",
            "corp_tax",
            "term_limit",
            "incwage_top_5",
            "unemp",
            "jobs_direct:unemp",
            "ln_pinc",
            "incwage_top_5:ln_pinc")

###########THEN SIMPLIFIED VERSION
X <- create_mat(cur_df, c(profit_vars, v_vars))
na_filter <- rowSums(is.na(X)) == 0
cur_df <- cur_df[na_filter, ]

#-------------------------------------------------------------------------------#
## Calculate profits, welfare, and valutions; output figs + summary stats
vars <- c(profit_vars, v_vars)
lm <- run_lm(cur_df, vars)


# calculate residuals
reg_coef <- coef(lm)[!grepl("sigma", names(coef(lm)))]
X <- create_mat(cur_df, names(reg_coef))
resid <- cur_df$sub_M - (X %*% reg_coef)

profit_vars_lm <- coef(lm)[grepl("diff", names(coef(lm)))]
v_vars_lm <- coef(lm)[!grepl("diff", names(coef(lm)))]

# Win profits
X <- create_mat(cur_df, profit_vars_win)
profits_win <- X %*% profit_vars_lm

# Ru profits
X <- create_mat(cur_df, profit_vars_ru)
profits_ru <- X %*% profit_vars_lm

# Ru valuation
X <- create_mat(cur_df, v_vars)
v_ru <- X %*% v_vars_lm

# subtract min profits + calculate welfare
min_profit_win <- min(profits_win, profits_ru)
profits_win_orig <- profits_win
#280 is min profits for services
profits_win <- profits_win - min_profit_win 
profits_ru_orig <- profits_ru
profits_ru <- profits_ru - min_profit_win 
welfare_ru_orig <- profits_ru_orig + v_ru + resid
welfare_ru <- profits_ru + v_ru + resid
v_ru_resid <- v_ru + resid

# v_ru_cond
v_ru_cond <- v_ru -
    coef(lm)['jobs_direct'] * cur_df$jobs_direct -
    coef(lm)['mult_jobs'] * cur_df$mult_jobs -
    coef(lm)['invest_B'] * cur_df$invest_B +
    coef(lm)['jobs_direct'] * mean(cur_df$jobs_direct) +
    coef(lm)['mult_jobs'] * mean(cur_df$mult_jobs)  +
    coef(lm)['invest_B'] * mean(cur_df$invest_B) 
v_ru_cond_resid = v_ru_cond + resid
v_ru_cond_resid[v_ru_cond_resid<0] <-0

# profit_ru_cond
profits_ru_cond <- profits_ru -
  coef(lm)['diff_occ_top2_naics3:jobs_direct'] * (cur_df$occ_top2_naics3 * cur_df$jobs_direct) +
  coef(lm)['diff_occ_top2_naics3:jobs_direct'] * cur_df$occ_top2_naics3 * mean(cur_df$jobs_direct) -
  coef(lm)['invest_B:diff_indus_zoning'] * (cur_df$indus_zoning * cur_df$invest_B) +
  coef(lm)['invest_B:diff_indus_zoning'] * cur_df$indus_zoning * mean(cur_df$invest_B) -
  coef(lm)['diff_corp_tax:invest_B'] * (cur_df$corp_tax * cur_df$invest_B) +
  coef(lm)['diff_corp_tax:invest_B'] * cur_df$corp_tax * mean(cur_df$invest_B) -
  coef(lm)['invest_B:diff_e_ind'] * (cur_df$e_ind * cur_df$invest_B) +
  coef(lm)['invest_B:diff_e_ind'] * cur_df$e_ind * mean(cur_df$invest_B)

profits_win_cond <- profits_win -
  coef(lm)['diff_occ_top2_naics3:jobs_direct'] * (cur_df$occ_top2_naics3_win * cur_df$jobs_direct) +
  coef(lm)['diff_occ_top2_naics3:jobs_direct'] * cur_df$occ_top2_naics3_win * mean(cur_df$jobs_direct) -
  coef(lm)['invest_B:diff_indus_zoning'] * (cur_df$indus_zoning_win * cur_df$invest_B) +
  coef(lm)['invest_B:diff_indus_zoning'] * cur_df$indus_zoning_win * mean(cur_df$invest_B) -
  coef(lm)['diff_corp_tax:invest_B'] * (cur_df$corp_tax_win * cur_df$invest_B) +
  coef(lm)['diff_corp_tax:invest_B'] * cur_df$corp_tax_win * mean(cur_df$invest_B) -
  coef(lm)['invest_B:diff_e_ind'] * (cur_df$e_ind_win * cur_df$invest_B) +
  coef(lm)['invest_B:diff_e_ind'] * cur_df$e_ind_win * mean(cur_df$invest_B)

## plot ru profits vs. valuations

welfare_ru_cond <- profits_ru_cond + v_ru_cond_resid

##export for figures 
write.csv(profits_ru_cond, "data/profits_norc.csv")

#-------------------------------------------------------------------------------#
#DISTRIBUTION OF WELFARE

#GOING TO USE CONDITIONAL W, AND THEN WE CAN ADD BACK IN 
#welfare: 
w_manuf <- as.data.frame(welfare_ru_cond) #w baseline
colnames(w_manuf) <- c("wbase")

#################################################################### Welfare in all locations 

# WHAT WE HAVE IS WELFARE IN THE RUNNER-UP LOCATION 
# --> SINCE THE AUCTION IS ON WELFARE (winning place has the highest pi+v)
# --> welfare in the runner-up place is the 2nd highest pi+v
# --> result from the auction literature: if we observe 2nd highest bidders w we can get full distribution of w
# --> the thing is for that result, we need the number of bidders


# WE HAVE NUMBER OF BIDDERS IN THE DATA: n_bid_win

w_manuf$nbid <- cur_df$n_bid

ru_m2<- w_manuf[which(w_manuf$nbid==2), 1:2]
ru_m3<- w_manuf[which(w_manuf$nbid==3), 1:2]
ru_m4<- w_manuf[which(w_manuf$nbid==4), 1:2]
ru_m5<- w_manuf[which(w_manuf$nbid==5), 1:2]
ru_m11<- w_manuf[which(w_manuf$nbid>5), 1:2]

t <- seq(-500,1800, by=1)

p2 <- parentcdf(ecdf(ru_m2$wbase)(t), 1,2 )
p3 <- parentcdf(ecdf(ru_m3$wbase)(t), 2,3 )
p4 <- parentcdf(ecdf(ru_m4$wbase)(t), 3,4 )
p5 <- parentcdf(ecdf(ru_m5$wbase)(t), 4,5 )
p11 <- parentcdf(ecdf(ru_m11$wbase)(t), 10,11 )

# now create mixture distribution
Fw <- (p2*length(ru_m2$wbase) + p3*length(ru_m3$wbase) + p4*length(ru_m4$wbase) +p5*length(ru_m5$wbase)  + p11*length(ru_m11$wbase))/length(w_manuf$wbase) 

#################################################################### Copula
#### possible values of v (manufacturing)
Funhat_mix <-  approxfun(t,Fw) # no unobservables 
Funhat_2 <-  approxfun(t,p2) # no unobservables 
Funhat_3 <-  approxfun(t,p3) # no unobservables 
Funhat_4 <-  approxfun(t,p4) # no unobservables 
Funhat_5 <-  approxfun(t,p5) # no unobservables 
Funhat_11 <-  approxfun(t,p11) # no unobservables 

#first, actually, have to calculate dependence parameter

# THIS IS THE CORRELATION BETWEEN RUNNER-UP PROFITS AND RUNNER-UP VALUATIONS
tau_base <- cor(profits_ru_cond, v_ru_cond_resid, method = "kendall")

# translate to the AMH dependence parameter 
d <- uniroot(function(x) ((3*x -2)/(3*x)) - (((2*(1 - x)^2)*log(1 - x))/(3*x^2)) - tau_base, interval=c(-1,-.00001))
d <- d$root


#joint density of copula
cop <- function(Hv, Hp, d) {
   (1+ d*((1+Hv)*(1+Hp)-3) + (d^2)* ((1-Hv)*(1-Hp)))/((1-(d*(1-Hv)*(1-Hp)))^3)
}

#Simulate values for pi, Hp 
#######################################################################################################
#profit distribution is given by calculated profits using beta-hat, sigma
#therefore, just going to use the simulated random coef profits, but drop <0 
manufpi <- c(profits_ru, profits_win)

#For each t, find Hv that minimizes difference between Hv_input and Hv_predicted
#######################################################################################################

t <- seq(-300,1200, by=5)
Hv_input <- seq(0,1, by=.01)

cond_cop <- function(hpi, hv, d) {
  (hpi*(1-d*(1-hpi)))/((1-d*(1-hv)*(1-hpi))^2)
}
H_pi<- ecdf(manufpi)
#H_pi2<- ecdf(manufpi_base)
#create grid of guesses for Hv
#for each guess Hv^g, draw pi from \hat{C}, S times 

p <- seq(-400,1000, by=1)
H_pi_sim <- 0
diff <- 0 
u <- runif(1000)
# DO FOR EACH NUMBER OF BIDDERS

Hv_solve=0
for (y in 1:length(t)) { #possible values of v
  Hv_predicted=0
  for (z in 1:101) { #possible values of Hv 
    #first need to simulate Hpi
    H_pi_sim <- cond_cop(H_pi(p),Hv_input[z],d)
    Hp_sim <- approxfun(p, H_pi_sim)
    InvHpi = approxfun(Hp_sim(p),p)
    profits <- InvHpi(u)
    profits[which(is.na(profits))] <- -200
    tw <- t[y] + profits  
    tw[which(tw> 1800)]<- 1800
    tw[which(tw< -500)]<- -500
    Hv_predicted[z] <- sum(Funhat_mix(tw))/length(tw)
    diff[z] <- abs(Hv_input[z] - Hv_predicted[z])
  }
    Hv_solve[y] <- Hv_predicted[[which.min(diff)]]
}

############################################################## Figures to show the distributions
####Manufacturing
x<- runif(10000, min=.1, max=.9995)
H_manuf <- approxfun(Hv_solve,t)
sample <-  H_manuf(x)
sample_fit <-  H_manuf(x)
sample_fit[which(sample_fit<=0)] <- 1
fit <- fitdistr(na.exclude(sample_fit), "lognormal")
sample_ln <- rlnorm(10000, fit$estimate['meanlog'], fit$estimate['sdlog'])

#################################################### COUNTERFACTUAL AND MODEL FIT 

#################################################################################
#-------------------------------------------------------------------------------#
# 1. Create sampled shortlist: 
#################################################################################
df <- read_stata("data/analysis_cz_long.dta")

#create dummies 

df$manuf_type_1 <- as.integer(df$manuf_type == 1)
df$manuf_type_3 <- as.integer(df$manuf_type == 3)

#drop outlier 
df <- filter(df, sub_M < 1000, df$manuf==1) 

df_sample <- filter(df, runnerup==1 | winner==1 | pop>200000 )
df_sample <- df_sample %>% filter(runnerup==1 | winner==1 | occ_top2_naics3>1)
df_sample <- df_sample %>% group_by(id_deal) %>% arrange(id_deal, desc(perc_est_n4))
df_sample <- df_sample %>% filter(runnerup==1 | winner==1 | row_number()<=100 )
df_sample <- df_sample %>% filter(runnerup==1 | winner==1 |  row_number()<=60 | r2w==1)
df_sample <- df_sample %>% filter(runnerup==1 | winner==1 | airport_any==1 | row_number()<=40 | r2w==1)
df_sample <- df_sample %>% ungroup() %>% mutate(n_bid = replace(n_bid, n_bid>15, 15))

df_shortlist <- df_sample %>% crossing(sim = c(1:50)) %>%  #this expands dataset to simulate over shortlists
  group_by(row_number()) %>% mutate(nsim=rnorm(1,0,1)) %>% ungroup() %>% #create random variable for sorting
  mutate(id_deal_sim = id_deal*100 + sim) %>% #new id_deal var, to track simulation number
  group_by(id_deal_sim) %>% # group by id_deal and simulation 
  arrange(id_deal_sim, desc(winner), desc(runnerup), nsim) %>% #sort 
  filter(row_number()<=n_bid) %>% select(-nsim) #only as many rows as there are bidders for each simulation 


#################################################################################
#-------------------------------------------------------------------------------#
# 2. PREDICT PROFITS  
#################################################################################
  
X <- create_mat(df_shortlist, profit_vars_ru)
df_shortlist$profits <- X %*% profit_vars_lm - min_profit_win 

# probably want to winsorize here. 
df_shortlist$profits <- winsor(df_shortlist$profits, trim=.01)


#################################################################################
#-------------------------------------------------------------------------------#
# 2. PICK WINNERS IN CF (profit only), MODEL FIT (simulated subsidy game) 
#################################################################################

#figure out quantile of each pi, and then map to quantile of v, simulate from copula. 

#empirical CDF of pi: 
H_pi1<- ecdf(df_shortlist$profits)

#given any value of pi, will give the location in the distribution 
#winning locations must have weakly positive profits
df_shortlist <- df_shortlist %>% mutate(profits=ifelse(profits<0 & winner==1, 0, profits)) %>%
  mutate(Hpi1 = H_pi1(profits)) 

#winners in data: 
df_shortlist_winners <- df_shortlist %>% filter(winner==1)
win_loc <- df_shortlist_winners %>% group_by(id_deal, winner, stateabbrev, sub_M) %>% 
  summarize(profit_win = mean(profits))
ru_loc <- df_shortlist %>% ungroup() %>% filter(runnerup==1) %>% select(all_of(c("runnerup", "id_deal", "stateabbrev"))) %>% distinct()

#################################################################################
# SUBSIDY GAME WINNER
#################################################################################
#OK, now for each simulated location, need to simulate valuation, given pi

#this is relationship between pi and v
cop_amh <- amhCopula(d, dim=2)

cf_vars <- c("id_deal", "id_deal_sim", "profits", 
             "winner", "runnerup", "stateabbrev", "statefip", "cz", "Hpi1", "sub_M")

#here we simulate v for each simulated pi (pi w and without random coef.)
sl_sim_v <- df_shortlist %>% select(all_of(cf_vars))
sl_sim_v <- sl_sim_v %>% ungroup() %>%  group_by(row_number()) %>% mutate(u = runif(1)) %>% ungroup() 
sl_sim_v <- sl_sim_v  %>% 
  mutate(Hv1 = cCopula(cbind(Hpi1, u), copula=cop_amh, indices=2, inverse=TRUE)) %>%
  mutate(v1 = quantile(sample_ln[which(sample_ln<1500)], Hv1, names=FALSE)) 

sl_sim_v <- sl_sim_v %>% mutate(v1=ifelse(winner==1 & v1<.75*sub_M, .75*sub_M, v1) )

#now choose winning location in each subsidy competition 
sim_win <- sl_sim_v %>% mutate(w = v1+profits) %>%
  group_by(id_deal_sim) %>% arrange(id_deal, id_deal_sim, desc(w)) %>%
  mutate(sim_sub = lead(w)-profits)  %>% filter(row_number()==1)
win_v <- sim_win %>% filter(winner==1) %>% ungroup() %>% group_by(id_deal, stateabbrev) %>%
  arrange(id_deal, stateabbrev) %>% summarize(v_win = mean(v1))
#################################################################################
# COUNTERFACTUAL WINNER
################################################################################
cf_vars <- c("id_deal", "id_deal_sim", "profits", 
             "winner", "runnerup", "stateabbrev", "statefip", "cz", "Hpi1", "sub_M", "v1")

#winner in counterfactual: 
df_shortlist_cf_win <- sl_sim_v %>% group_by(id_deal_sim) %>% 
  arrange(id_deal, id_deal_sim, desc(profits), desc(winner)) %>% 
  select(all_of(cf_vars)) %>% filter(row_number()==1)

# #################################################################################
# # MODEL FIT
# ################################################################################
# 
# #don't always pick the right place when simulate (with subsidies)
# #here percent is by the number of simulations
sim_location_id <- sim_win %>% group_by(id_deal, stateabbrev) %>% summarize(no_obs_sim = n()) %>% 
   mutate(perc_loc_sim = 100*no_obs_sim/50) %>% mutate(adjustment = 1-perc_loc_sim/100)
# 
# #basically want to merge this in with the counterfactual winners to make the adjustment: 
# #also going to track and scale the valuations appropriately for the counterfactual table ! 
# 
 sim_cf_win <- df_shortlist_cf_win %>% group_by(id_deal, stateabbrev, winner) %>%  
   summarize(no_obs_cf = n(), mean_v = mean(v1), mean_pi = mean(profits))
sim_location_cf <- merge(sim_cf_win , sim_location_id, by=c("id_deal", "stateabbrev"), all.x=TRUE) 
# #make sure each deal has the winning location as an option, and no need for adjustment for winners: 
sim_location_cf <- merge(sim_location_cf, win_loc, by=c("id_deal", "stateabbrev", "winner"), all.x=TRUE, all.y=TRUE) %>%
   mutate(adjustment = ifelse(winner==1,1,adjustment)) 
# #if missing in model simulation then no need for adjustment:
 sim_location_cf$adjustment[is.na(sim_location_cf$adjustment)] <- 1
# 
 sim_location_cf_summary <- sim_location_cf %>%  mutate(no_obs_cf_adj = adjustment*no_obs_cf) %>%
   group_by(id_deal, winner) %>% arrange(id_deal, desc(winner)) %>% 
   summarize(sum_adj = sum(no_obs_cf_adj), mean_v = mean(mean_v), mean_pi = mean(mean_pi)) 
 share_winner <- sim_location_cf_summary %>% arrange(id_deal, desc(winner)) %>% 
   mutate(share = 100*(50-lead(sum_adj))/50) %>% filter(row_number()==1)
# 
share_winner$share[is.na(share_winner$share)] <- 100  
 share_winner <- merge(share_winner, win_loc, by=c("id_deal", "winner")) %>% select(-c("sum_adj"))
 share_winner <- merge(share_winner, win_v, by=c("id_deal", "stateabbrev"), all.x=TRUE) 
#
# #now will append with the runners-up and non-runners-up/winners 
 share_other <- sim_location_cf %>%  mutate(share = 100*adjustment*no_obs_cf/50)  %>% filter(winner==0) %>% select(-c("winner"))
 share_other <- merge(share_other, ru_loc, by=c("id_deal", "stateabbrev"), all.x=TRUE) %>% 
   select(all_of(c("id_deal", "runnerup", "stateabbrev", "share", "mean_v", "mean_pi")))
 share_other$runnerup[is.na(share_other$runnerup)] <- 0
# 
cf_loc <- bind_rows(share_winner, share_other) %>% group_by(id_deal) %>% arrange(id_deal)
cf_loc$runnerup[is.na(cf_loc$runnerup)] <- 0
cf_loc$winner[is.na(cf_loc$winner)] <- 0
cf_loc <- cf_loc %>% mutate(mean_v=ifelse(is.na(mean_v),v_win, mean_v))  %>% 
   mutate(mean_pi=ifelse(is.na(mean_pi),profit_win, mean_pi))  %>%
  mutate(adj_v = mean_v*(share/100)) %>%  mutate(adj_pi = mean_pi*(share/100))
 write.csv(cf_loc, "data/cf_locations_norc.csv")
 
 #-------------------------------------------------------------------------------#
 # SERVICES
 #-------------------------------------------------------------------------------#
 # Filter data + create dummies
 
 df <- read_stata("data/analysis_cz.dta")
 df <- filter(df, ru_limit == 1) # filter to only include CZs with runner-ups
 df <- filter(df, sub_M < 400)
 df$trade_0 <- as.integer(df$trade == 0)
 df$trade_1 <- as.integer(df$trade == 1)
 
 df$lowserv_0 <- as.integer(df$low_serv == 0)
 df$lowserv_1 <- as.integer(df$low_serv == 1)
 
 # sample one runner-up within subsidy ID
 set.seed(1)
 df <- df %>%
   group_by(id_win) %>%
   slice_sample(n = 1)
 
 cur_df <- filter(df, manuf==0)
 
 #----------------------------------------------------------------------------#
 
 
 profit_vars <- c("diff_corp_tax", 
                  "diff_income_tax",
                  "diff_proptax",
                  "diff_r2w",
                  "diff_FHFA",
                  "diff_e_comm",
                  "diff_auto_roadnetwork",
                  "diff_incwage_top_5:lowserv_0",
                  "diff_incwage_top_5:lowserv_1",
                  "diff_FHFA:invest_B",
                  "diff_airport_any:trade_1",
                  "diff_top_RD:trade_0",
                  "diff_top_RD:trade_1",
                  "diff_income_tax:jobs_direct")
 
 profit_vars_win <- c("corp_tax_win", 
                      "income_tax_win",
                      "proptax_win",
                      "r2w_win",
                      "FHFA_win",
                      "e_comm_win",
                      "auto_roadnetwork_win",
                      "incwage_top_5_win:lowserv_0",
                      "incwage_top_5_win:lowserv_1",
                      "FHFA_win:invest_B",
                      "airport_any_win:trade_1",
                      "top_RD_win:trade_0",
                      "top_RD_win:trade_1",
                      "income_tax_win:jobs_direct")
 
 profit_vars_ru <- c("corp_tax", 
                     "income_tax",
                     "proptax",
                     "r2w",
                     "FHFA",
                     "e_comm",
                     "auto_roadnetwork",
                     "incwage_top_5:lowserv_0",
                     "incwage_top_5:lowserv_1",
                     "FHFA:invest_B",
                     "airport_any:trade_1",
                     "top_RD:trade_0",
                     "top_RD:trade_1",
                     "income_tax:jobs_direct")
 
 
 v_vars <- c("jobs_direct",
             "mult_total",
             "invest_B",
             "income_tax",
             "corp_tax",
             "sales_tax",
             "proptax",
             "term_limit",
             "unemp",
             "incwage_top_5",
             "ln_pinc")
 
 ###########THEN SIMPLIFIED VERSION
 X <- create_mat(cur_df, c(profit_vars, v_vars))
 na_filter <- rowSums(is.na(X)) == 0
 cur_df <- cur_df[na_filter, ]
 
 #-------------------------------------------------------------------------------#
 ## Calculate profits, welfare, and valutions; output figs + summary stats
 vars <- c(profit_vars, v_vars)
 lm <- run_lm(cur_df, vars)
 
 
 # calculate residuals
 reg_coef <- coef(lm)[!grepl("sigma", names(coef(lm)))]
 X <- create_mat(cur_df, names(reg_coef))
 resid <- cur_df$sub_M - (X %*% reg_coef)
 
 profit_vars_lm <- coef(lm)[grepl("diff", names(coef(lm)))]
 v_vars_lm <- coef(lm)[!grepl("diff", names(coef(lm)))]
 
 # Win profits
 X <- create_mat(cur_df, profit_vars_win)
 profits_win <- X %*% profit_vars_lm
 
 # Ru profits
 X <- create_mat(cur_df, profit_vars_ru)
 profits_ru <- X %*% profit_vars_lm
 
 # Ru valuation
 X <- create_mat(cur_df, v_vars)
 v_ru <- X %*% v_vars_lm
 
 # subtract min profits + calculate welfare
 min_profit_win <- min(profits_win, profits_ru)
 profits_win_orig <- profits_win
 #280 is min profits for services
 profits_win <- profits_win - min_profit_win 
 profits_ru_orig <- profits_ru
 profits_ru <- profits_ru - min_profit_win 
 welfare_ru_orig <- profits_ru_orig + v_ru + resid
 welfare_ru <- profits_ru + v_ru + resid
 v_ru_resid <- v_ru + resid
 
 # v_ru_cond
 v_ru_cond <- v_ru -
   coef(lm)['jobs_direct'] * cur_df$jobs_direct -
   coef(lm)['mult_total'] * cur_df$mult_total -
   coef(lm)['invest_B'] * cur_df$invest_B +
   coef(lm)['jobs_direct']  * mean(cur_df$jobs_direct)  +
   coef(lm)['mult_total'] * mean(cur_df$mult_total) +
   coef(lm)['invest_B'] * mean(cur_df$invest_B) 
 v_ru_cond_resid =winsor(v_ru_cond + resid, trim=.01)
 
 # profit_ru_cond
 profits_ru_cond <- profits_ru -
   coef(lm)['diff_income_tax:jobs_direct'] * (cur_df$income_tax * cur_df$jobs_direct) +
   coef(lm)['diff_income_tax:jobs_direct'] * cur_df$income_tax * mean(cur_df$jobs_direct)  -
   coef(lm)['diff_FHFA:invest_B'] * (cur_df$FHFA * cur_df$invest_B) +
   coef(lm)['diff_FHFA:invest_B'] * cur_df$FHFA * mean(cur_df$invest_B)
 
 profits_win_cond <- profits_win -
   coef(lm)['diff_income_tax:jobs_direct'] * (cur_df$income_tax_win * cur_df$jobs_direct) +
   coef(lm)['diff_income_tax:jobs_direct'] * cur_df$income_tax_win  * mean(cur_df$jobs_direct)  -
   coef(lm)['diff_FHFA:invest_B'] * (cur_df$FHFA_win * cur_df$invest_B) +
   coef(lm)['diff_FHFA:invest_B'] * cur_df$FHFA_win * mean(cur_df$invest_B)
 ## plot ru profits vs. valuations
 
 welfare_ru_cond <- profits_ru_cond + v_ru_cond_resid
 
 #-------------------------------------------------------------------------------#
 #DISTRIBUTION OF WELFARE
 
 #GOING TO USE CONDITIONAL W, AND THEN WE CAN ADD BACK IN 
 #welfare: 
 w_serv <- as.data.frame(welfare_ru_cond) #w baseline
 colnames(w_serv) <- c("wbase")
 
 #################################################################### Welfare in all locations 
 
 # WHAT WE HAVE IS WELFARE IN THE RUNNER-UP LOCATION 
 # --> SINCE THE AUCTION IS ON WELFARE (winning place has the highest pi+v)
 # --> welfare in the runner-up place is the 2nd highest pi+v
 # --> result from the auction literature: if we observe 2nd highest bidders w we can get full distribution of w
 # --> the thing is for that result, we need the number of bidders
 
 
 # WE HAVE NUMBER OF BIDDERS IN THE DATA: n_bid_win
 
 w_serv$nbid <- cur_df$n_bid
 
 ru_m2<- w_serv[which(w_serv$nbid==2), 1:2]
 ru_m3<- w_serv[which(w_serv$nbid==3), 1:2]
 ru_m4<- w_serv[which(w_serv$nbid==4), 1:2]
 ru_m5<- w_serv[which(w_serv$nbid==5), 1:2]
 ru_m11<- w_serv[which(w_serv$nbid>5), 1:2]
 
 t <- seq(-500,1500, by=1)
 
 p2 <- parentcdf(ecdf(ru_m2$wbase)(t), 1,2 )
 p3 <- parentcdf(ecdf(ru_m3$wbase)(t), 2,3 )
 p4 <- parentcdf(ecdf(ru_m4$wbase)(t), 3,4 )
 p5 <- parentcdf(ecdf(ru_m5$wbase)(t), 4,5 )
 p11 <- parentcdf(ecdf(ru_m11$wbase)(t), 10,11 )
 
 # now create mixture distribution
 Fw <- (p2*length(ru_m2$wbase) + p3*length(ru_m3$wbase) + p4*length(ru_m4$wbase) +p5*length(ru_m5$wbase)  + p11*length(ru_m11$wbase))/length(w_serv$wbase) 
 
 #################################################################### Copula
 #### possible values of v (manufacturing)
 Funhat_mix <-  approxfun(t,Fw) # no unobservables 
 Funhat_2 <-  approxfun(t,p2) # no unobservables 
 Funhat_3 <-  approxfun(t,p3) # no unobservables 
 Funhat_4 <-  approxfun(t,p4) # no unobservables 
 Funhat_5 <-  approxfun(t,p5) # no unobservables 
 Funhat_11 <-  approxfun(t,p11) # no unobservables 
 
 #first, actually, have to calculate dependence parameter
 
 # THIS IS THE CORRELATION BETWEEN RUNNER-UP PROFITS AND RUNNER-UP VALUATIONS
 tau_base <- cor(profits_ru_cond, v_ru_cond_resid, method = "kendall")
 
 # translate to the AMH dependence parameter 
 d <- uniroot(function(x) ((3*x -2)/(3*x)) - (((2*(1 - x)^2)*log(1 - x))/(3*x^2)) - tau_base, interval=c(-2,-.00001))
 d <- max(-1,d$root)
 
 
 #joint density of copula
 cop <- function(Hv, Hp, d) {
   (1+ d*((1+Hv)*(1+Hp)-3) + (d^2)* ((1-Hv)*(1-Hp)))/((1-(d*(1-Hv)*(1-Hp)))^3)
 }
 
 #Simulate values for pi, Hp 
 #######################################################################################################
 #profit distribution is given by calculated profits using beta-hat, sigma
 #therefore, just going to use the simulated random coef profits, but drop <0 
 servpi <- c(profits_ru, profits_win)
 
 #For each t, find Hv that minimizes difference between Hv_input and Hv_predicted
 #######################################################################################################
 
 
 t <- seq(-300,1200, by=5)
 Hv_input <- seq(0,1, by=.01)
 
 cond_cop <- function(hpi, hv, d) {
   (hpi*(1-d*(1-hpi)))/((1-d*(1-hv)*(1-hpi))^2)
 }
 H_pi<- ecdf(servpi)
 
 #creat grid of guesses for Hv
 #for each guess Hv^g, draw pi from \hat{C}, S times 
 
 p <- seq(-400,1000, by=1)
 H_pi_sim <- 0
 diff <- 0 
 u <- runif(1000)
 # DO FOR EACH NUMBER OF BIDDERS
 
 Hv_solve=0
 for (y in 1:length(t)) { #possible values of v
   Hv_predicted=0
   for (z in 1:101) { #possible values of Hv 
     #first need to simulate Hpi
     H_pi_sim <- cond_cop(H_pi(p),Hv_input[z],d)
     Hp_sim <- approxfun(p, H_pi_sim)
     InvHpi = approxfun(Hp_sim(p),p)
     profits <- InvHpi(u)
     tw <- t[y] + profits  
     tw[which(tw> 1500)]<- 1500
     tw[which(tw< -500)]<- -500
     Hv_predicted[z] <- sum(Funhat_mix(tw))/length(tw)
     diff[z] <- abs(Hv_input[z] - Hv_predicted[z])
   }
   Hv_solve[y] <- Hv_predicted[[which.min(diff)]]
 }
 
 
 x<- runif(10000, min=.01, max=.9999)
 H_serv <- approxfun(Hv_solve,t)
 sample <-  H_serv(x)
 sample_fit <-  na.omit(sample)
 sample_fit[which(sample_fit<=0)] <- 1
 fit <- fitdistr(sample_fit, "lognormal")
 sample_ln <- rlnorm(10000, fit$estimate['meanlog'], fit$estimate['sdlog'])
 summary(sample)
 summary(sample_ln[which(sample_ln<1000)])
 
 
 
 #################################################### COUNTERFACTUAL AND MODEL FIT 
 
 #################################################################################
 #-------------------------------------------------------------------------------#
 # 1. Create sampled shortlist: 
 #################################################################################
 df <- read_stata("data/analysis_cz_long.dta")
 
 #drop outlier 
 df <- filter(df, sub_M < 400, df$manuf==0) 
 df$trade_0 <- as.integer(df$trade == 0)
 df$trade_1 <- as.integer(df$trade == 1)
 
 
 df$lowserv_0 <- as.integer(df$low_serv == 0)
 df$lowserv_1 <- as.integer(df$low_serv == 1)
 
 
 df_sample <- filter(df, runnerup==1 | winner==1 |  pop>200000)
 df_sample <- df_sample %>% filter(runnerup==1 | winner==1 | occ_top2_naics3>1)
 df_sample <- df_sample %>% group_by(id_deal) %>% arrange(id_deal, desc(perc_est_n4))
 df_sample <- df_sample %>% filter(runnerup==1 | winner==1 | airport_any==1 | row_number()<=50 )
 df_sample <- df_sample %>% ungroup() %>% mutate(n_bid = replace(n_bid, n_bid>15, 15))
 
 df_shortlist <- df_sample %>% crossing(sim = c(1:50)) %>%  #this expands dataset to simulate over shortlists
   group_by(row_number()) %>% mutate(nsim=rnorm(1,0,1)) %>% ungroup() %>% #create random variable for sorting
   mutate(id_deal_sim = id_deal*100 + sim) %>% #new id_deal var, to track simulation number
   group_by(id_deal_sim) %>% # group by id_deal and simulation 
   arrange(id_deal_sim, desc(winner), desc(runnerup), nsim) %>% #sort 
   filter(row_number()<=n_bid) %>% select(-nsim) #only as many rows as there are bidders for each simulation 
 
 
 #################################################################################
 #-------------------------------------------------------------------------------#
 # 2. PREDICT PROFITS  
 #################################################################################
 
 X <- create_mat(df_shortlist, profit_vars_ru)
 df_shortlist$profits <- X %*% profit_vars_lm - min_profit_win 
 
 # probably want to winsorize here. 
 df_shortlist$profits <- winsor(df_shortlist$profits, trim=.01)
 
 
 #################################################################################
 #-------------------------------------------------------------------------------#
 # 2. PICK WINNERS IN CF (profit only), MODEL FIT (simulated subsidy game) 
 #################################################################################
 
 #figure out quantile of each pi, and then map to quantile of v, simulate from copula. 
 
 #empirical CDF of pi: 
 H_pi1<- ecdf(df_shortlist$profits)
 
 #given any value of pi, will give the location in the distribution 
 #winning locations must have weakly positive profits
 df_shortlist <- df_shortlist %>% mutate(profits=ifelse(profits<0 & winner==1, 0, profits)) %>%
   mutate(Hpi1 = H_pi1(profits)) 
 
 #winners in data: 
 df_shortlist_winners <- df_shortlist %>% filter(winner==1)
 win_loc <- df_shortlist_winners %>% group_by(id_deal, winner, stateabbrev, sub_M) %>% 
   summarize(profit_win = mean(profits))
 ru_loc <- df_shortlist %>% ungroup() %>% filter(runnerup==1) %>% select(all_of(c("runnerup", "id_deal", "stateabbrev"))) %>% distinct()
 
 #################################################################################
 # SUBSIDY GAME WINNER
 #################################################################################
 #OK, now for each simulated location, need to simulate valuation, given pi
 
 #this is relationship between pi and v
 cop_amh <- amhCopula(d, dim=2)
 
 cf_vars <- c("id_deal", "id_deal_sim", "profits", 
              "winner", "runnerup", "stateabbrev", "statefip", "cz", "Hpi1", "sub_M")
 
 #here we simulate v for each simulated pi (pi w and without random coef.)
 sl_sim_v <- df_shortlist %>% select(all_of(cf_vars))
 sl_sim_v <- sl_sim_v %>% ungroup() %>%  group_by(row_number()) %>% mutate(u = runif(1)) %>% ungroup() 
 sl_sim_v <- sl_sim_v  %>% 
   mutate(Hv1 = cCopula(cbind(Hpi1, u), copula=cop_amh, indices=2, inverse=TRUE)) %>%
   mutate(v1 = quantile(sample_ln[which(sample_ln<1000)], Hv1, names=FALSE)) 
 
 sl_sim_v <- sl_sim_v %>% mutate(v1=ifelse(winner==1 & v1<.75*sub_M, .75*sub_M, v1) )
 
 #now choose winning location in each subsidy competition 
 sim_win <- sl_sim_v %>% mutate(w = v1+profits) %>%
   group_by(id_deal_sim) %>% arrange(id_deal, id_deal_sim, desc(w)) %>%
   mutate(sim_sub = lead(w)-profits)  %>% filter(row_number()==1)
 win_v <- sim_win %>% filter(winner==1) %>% ungroup() %>% group_by(id_deal, stateabbrev) %>%
   arrange(id_deal, stateabbrev) %>% summarize(v_win = mean(v1))
 #################################################################################
 # COUNTERFACTUAL WINNER
 ################################################################################
 cf_vars <- c("id_deal", "id_deal_sim", "profits", 
              "winner", "runnerup", "stateabbrev", "statefip", "cz", "Hpi1", "sub_M", "v1")
 
 #winner in counterfactual: 
 df_shortlist_cf_win <- sl_sim_v %>% group_by(id_deal_sim) %>% 
   arrange(id_deal, id_deal_sim, desc(profits), desc(winner)) %>% 
   select(all_of(cf_vars)) %>% filter(row_number()==1)
 
 # #################################################################################
 # # MODEL FIT
 # ################################################################################
 # 
 # #don't always pick the right place when simulate (with subsidies)
 # #here percent is by the number of simulations
 sim_location_id <- sim_win %>% group_by(id_deal, stateabbrev) %>% summarize(no_obs_sim = n()) %>% 
   mutate(perc_loc_sim = 100*no_obs_sim/50) %>% mutate(adjustment = 1-perc_loc_sim/100)
 # 
 # #basically want to merge this in with the counterfactual winners to make the adjustment: 
 # #also going to track and scale the valuations appropriately for the counterfactual table ! 
 # 
 sim_cf_win <- df_shortlist_cf_win %>% group_by(id_deal, stateabbrev, winner) %>%  
   summarize(no_obs_cf = n(), mean_v = mean(v1), mean_pi = mean(profits))
 sim_location_cf <- merge(sim_cf_win , sim_location_id, by=c("id_deal", "stateabbrev"), all.x=TRUE) 
 # #make sure each deal has the winning location as an option, and no need for adjustment for winners: 
 sim_location_cf <- merge(sim_location_cf, win_loc, by=c("id_deal", "stateabbrev", "winner"), all.x=TRUE, all.y=TRUE) %>%
   mutate(adjustment = ifelse(winner==1,1,adjustment)) 
 # #if missing in model simulation then no need for adjustment:
 sim_location_cf$adjustment[is.na(sim_location_cf$adjustment)] <- 1
 # 
 sim_location_cf_summary <- sim_location_cf %>%  mutate(no_obs_cf_adj = adjustment*no_obs_cf) %>%
   group_by(id_deal, winner) %>% arrange(id_deal, desc(winner)) %>% 
   summarize(sum_adj = sum(no_obs_cf_adj), mean_v = mean(mean_v), mean_pi = mean(mean_pi)) 
 share_winner <- sim_location_cf_summary %>% arrange(id_deal, desc(winner)) %>% 
   mutate(share = 100*(50-lead(sum_adj))/50) %>% filter(row_number()==1)
 # 
 share_winner$share[is.na(share_winner$share)] <- 100  
 share_winner <- merge(share_winner, win_loc, by=c("id_deal", "winner")) %>% select(-c("sum_adj"))
 share_winner <- merge(share_winner, win_v, by=c("id_deal", "stateabbrev"), all.x=TRUE) 
 #
 # #now will append with the runners-up and non-runners-up/winners 
 share_other <- sim_location_cf %>%  mutate(share = 100*adjustment*no_obs_cf/50)  %>% filter(winner==0) %>% select(-c("winner"))
 share_other <- merge(share_other, ru_loc, by=c("id_deal", "stateabbrev"), all.x=TRUE) %>% 
   select(all_of(c("id_deal", "runnerup", "stateabbrev", "share", "mean_v", "mean_pi")))
 share_other$runnerup[is.na(share_other$runnerup)] <- 0
 # 
 cf_loc <- bind_rows(share_winner, share_other) %>% group_by(id_deal) %>% arrange(id_deal)
 cf_loc$runnerup[is.na(cf_loc$runnerup)] <- 0
 cf_loc$winner[is.na(cf_loc$winner)] <- 0
 cf_loc <- cf_loc %>% mutate(mean_v=ifelse(is.na(mean_v),v_win, mean_v))  %>% 
   mutate(mean_pi=ifelse(is.na(mean_pi),profit_win, mean_pi))  %>%
   mutate(adj_v = mean_v*(share/100)) %>%  mutate(adj_pi = mean_pi*(share/100))
 write.csv(cf_loc, "data/cf_locations_serv_norc.csv")
 
 